3.3.4 Main analysis (Exploratory Data Analysis)
Dataset 1) Summary Measure of Health
The data is a statistical summary of the survey data and includes columns such as the confidence interval and variance for each factor; in our project, however, we analyze only the central (point-estimate) value of each factor. Each row is the point-estimate observation for one county in one state. Cell values of -2222.20 and -1111.10 indicate missing data and have been converted to NA. We visualize the missing-data pattern in this dataset: the variables Health_Status and Unhealthy_Days have the most missing values.
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.7
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
# Keep the county/state identifier columns plus the four measures analysed
# in this section (ALE, All_Death, Health_Status, Unhealthy_Days).
summary_measure_df1 <- select(
  summary_measure,
  State_FIPS_Code, County_FIPS_Code, CHSI_County_Name, CHSI_State_Name,
  CHSI_State_Abbr, ALE, All_Death, Health_Status, Unhealthy_Days
)

# -2222.20 and -1111.10 are the sentinel codes used for missing data;
# recode every matching cell to NA.
summary_measure_df1[summary_measure_df1 == -2222.20] <- NA
summary_measure_df1[summary_measure_df1 == -1111.10] <- NA

# Plot the missing-value pattern of the selected columns.
# NOTE(review): sort = "b" is passed through to extracat::visna unchanged;
# see the extracat documentation for the exact ordering it applies.
library(extracat)
visna(summary_measure_df1, sort = "b")
We remove the rows containing NAs and average the county values within each state. The bar charts show the states ordered by their average value for each of the four factors across the 50 states:
# Keep only the counties that have no missing value in any selected column.
summary_measure_df1 <- summary_measure_df1[complete.cases(summary_measure_df1), ]

# State-level mean of each of the four measures (one row per state
# abbreviation). The short columns mAD / mHS / mUD are kept next to their
# descriptive copies so that code referring to either name keeps working.
summary_measure_state <- summary_measure_df1 %>%
  group_by(CHSI_State_Abbr) %>%
  summarise(
    # The argument is `na.rm`; a misspelt `rm.na` is silently swallowed by
    # mean()'s `...` and has no effect.
    meanALE = mean(ALE, na.rm = TRUE),
    mAD = mean(All_Death, na.rm = TRUE),
    mHS = mean(Health_Status, na.rm = TRUE),
    mUD = mean(Unhealthy_Days, na.rm = TRUE)
  ) %>%
  mutate(
    meanAll_Death = mAD,
    meanHealth_Status = mHS,
    meanUnhealthy_Days = mUD
  )
# Average Life Expectancy: the average number of years that a baby born in
# 1990 is expected to live if current mortality trends continue to apply.
# One bar per state, with states ordered by their mean ALE.
ggplot(
  data = summary_measure_state,
  mapping = aes(x = reorder(CHSI_State_Abbr, meanALE), y = meanALE)
) +
  geom_col(fill = "rosybrown") +
  labs(
    x = "States",
    y = "Average Value",
    title = "Histogram Visualization for Average Life Expectancy"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14)
  )
# All_Death: mortality from any cause, i.e. the average annual rate of all
# causes of death. One bar per state, ordered by the state mean.
ggplot(
  data = summary_measure_state,
  mapping = aes(x = reorder(CHSI_State_Abbr, meanAll_Death), y = meanAll_Death)
) +
  geom_col(fill = "rosybrown") +
  labs(
    x = "States",
    y = "Average Value",
    title = "Histogram Visualization for All Death"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14)
  )
# Health_Status: self-rated health status. One bar per state, ordered by the
# state mean.
# NOTE(review): the direction of this scale (whether a larger value means
# better or worse self-rated health) is not visible here; confirm against the
# dataset's data dictionary before interpreting the ordering.
ggplot(
  data = summary_measure_state,
  mapping = aes(
    x = reorder(CHSI_State_Abbr, meanHealth_Status),
    y = meanHealth_Status
  )
) +
  geom_col(fill = "rosybrown") +
  labs(
    x = "States",
    y = "Average Value",
    title = "Histogram Visualization for Self-rated Health Status"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14)
  )
# Unhealthy_Days: the average number of unhealthy days (mental or physical)
# in the past 30 days, as reported by adults age 18 and older.
# One bar per state, ordered by the state mean.
ggplot(
  data = summary_measure_state,
  mapping = aes(
    x = reorder(CHSI_State_Abbr, meanUnhealthy_Days),
    y = meanUnhealthy_Days
  )
) +
  geom_col(fill = "rosybrown") +
  labs(
    x = "States",
    y = "Average Value",
    title = "Histogram Visualization for Unhealthy Days"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14)
  )
# Output the cleaned datafile
# write.csv(summary_measure_state, file = "summary_measure_state.csv", row.names = FALSE)
The Cleveland dot plots provide another way to visualize how the states are ordered by each of the four factors across the 50 states:
# Build one long-format table per measure for the Cleveland dot plots: the
# chosen mean column is folded into a `variable` (column name) / `Value`
# (state mean) pair, and all other columns are carried along unchanged.
summary_measure_state2 <- summary_measure_state %>%
  gather(key = "variable", value = "Value", meanALE)
summary_measure_state3 <- summary_measure_state %>%
  gather(key = "variable", value = "Value", meanAll_Death)
summary_measure_state4 <- summary_measure_state %>%
  gather(key = "variable", value = "Value", meanUnhealthy_Days)
summary_measure_state5 <- summary_measure_state %>%
  gather(key = "variable", value = "Value", meanHealth_Status)
# Cleveland dot plot of the state-level mean ALE: one point per state, with
# the states on the y-axis ordered by their value.
ggplot(
  data = summary_measure_state2,
  mapping = aes(
    x = Value,
    y = fct_reorder2(CHSI_State_Abbr, fct_rev(variable), -Value)
  )
) +
  geom_point(aes(colour = variable)) +
  scale_colour_manual(name = "Variables", values = "rosybrown") +
  labs(
    x = "Average Value",
    y = "States",
    title = "Cleveland for Average Life Expectancy"
  ) +
  theme(
    axis.text.x = element_text(hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14)
  )
# Cleveland dot plot of the state-level mean All_Death: one point per state,
# with the states on the y-axis ordered by their value.
ggplot(
  data = summary_measure_state3,
  mapping = aes(
    x = Value,
    y = fct_reorder2(CHSI_State_Abbr, fct_rev(variable), -Value)
  )
) +
  geom_point(aes(colour = variable)) +
  scale_colour_manual(name = "Variables", values = "rosybrown") +
  labs(
    x = "Average Value",
    y = "States",
    title = "Cleveland for All Death"
  ) +
  theme(
    axis.text.x = element_text(hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14)
  )
# Cleveland dot plot of the state-level mean Unhealthy_Days: one point per
# state, with the states on the y-axis ordered by their value.
# summary_measure_state4 holds the gathered meanUnhealthy_Days column, so the
# title names Unhealthy Days (it previously said "Self-rated Health Status",
# which contradicted the data and the legend).
ggplot(
  data = summary_measure_state4,
  mapping = aes(
    x = Value,
    y = fct_reorder2(CHSI_State_Abbr, fct_rev(variable), -Value)
  )
) +
  geom_point(aes(colour = variable)) +
  scale_colour_manual(name = "Variables", values = "rosybrown") +
  labs(
    x = "Average Value",
    y = "States",
    title = "Cleveland for Unhealthy Days"
  ) +
  theme(
    axis.text.x = element_text(hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14)
  )
# Cleveland dot plot of the state-level mean Health_Status: one point per
# state, with the states on the y-axis ordered by their value.
# summary_measure_state5 holds the gathered meanHealth_Status column, so the
# title names Self-rated Health Status (it previously said "Unhealthy Days",
# which contradicted the data and the legend).
ggplot(
  data = summary_measure_state5,
  mapping = aes(
    x = Value,
    y = fct_reorder2(CHSI_State_Abbr, fct_rev(variable), -Value)
  )
) +
  geom_point(aes(colour = variable)) +
  scale_colour_manual(name = "Variables", values = "rosybrown") +
  labs(
    x = "Average Value",
    y = "States",
    title = "Cleveland for Self-rated Health Status"
  ) +
  theme(
    axis.text.x = element_text(hjust = 1, size = 9),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14)
  )
Main conclusions from the bar charts/Cleveland plots:
Washington, D.C. has the smallest ALE value while Hawaii has the largest. The variation in `ALE` is quite small, as it ranges from 72 years to 79.47 years. The plots for `Unhealthy Days` and `All Death` give a consistent finding: Hawaii has the smallest value in both. West Virginia has the largest value of unhealthy days, and Mississippi has the largest value of all deaths. However, the plots for self-rated `Health Status` show an interesting result: states with larger values of `Unhealthy Days` tend to have a higher rating for their `Health Status`.